# Instituto de Estudos Sociais e Políticos (IESP)
# Universidade do Estado do Rio de Janeiro (UERJ)

# Topic-modelling script for the article "The Gender Division of Labor in Brazilian Political Science Publications".
# Script authors: Luiz Augusto Campos, Fernando Guarnieri and Thiago Moreira
# Article authors: Marcia Rangel Cândido, Luiz Augusto Campos, João Feres Júnior
# The article will be published in the Brazilian Political Science Review
# 
# The script runs in 6 steps

# 1) Preparation

## Install whichever of the required packages are not available yet.
## `parallel` is deliberately absent from this list: it ships with base R, is
## not distributed on CRAN, and install.packages("parallel") only produces a
## warning. Packages that are already installed are left alone so that
## re-running the script does not reinstall (and possibly upgrade) them.
required_pkgs <- c(
  "tm", "stringr", "topicmodels", "SnowballC", "ggplot2",
  "tidytext", "ldatuning", "quanteda"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}


## Rodar todos os pacotes a seguir
library(tm)
library(ggplot2)
library(stringr)
library(topicmodels)
library(SnowballC)
library(ldatuning)
library(parallel)
library(tidytext)
library(quanteda)


## Let parallel-aware functions use every processor core.
## detectCores() is documented to return NA when the count cannot be
## determined; fall back to a single core instead of storing NA in the option.
n_cores <- parallel::detectCores()
if (is.na(n_cores)) {
  n_cores <- 1L
}
options(mc.cores = n_cores)

## Placeholder: replace with the folder that holds the input CSV.
## The output files written later in the script also land in this folder.
setwd("Diretorio dos Dados")

## Semicolon-separated file with one row per article.
dado <- read.csv2(
  "base_Scielo_CP_2005-2018_Resumos.csv",
  stringsAsFactors = FALSE
)

## The corpus is built from the `resumo_en` column; fail here with a clear
## message if the file was read with the wrong separator or layout.
if (!"resumo_en" %in% names(dado)) {
  stop("Column 'resumo_en' not found in the input file.", call. = FALSE)
}

# 2) Corpus cleaning

## Recurrent terms that are removed from every abstract once stemming is done.
custom_stems <- c(
  "polit", "defin", "motiv", "note", "therefor", "becom", "appear", "may",
  "continu", "fact", "long", "beyond", "without", "back", "reach", "taken",
  "today", "seen", "past", "work", "thesi", "product", "field", "chang",
  "purpos", "event", "effect", "reflect", "look", "build", "associ", "act",
  "held", "support", "accord", "design", "path", "oper", "final", "sector",
  "promot", "pass", "need", "certain", "tri", "anoth", "remain", "charact",
  "sourc", "conting", "consolid", "give", "articul", "link", "implic", "side",
  "line", "around", "recent", "especi", "condit", "connect", "due", "intend",
  "reduc", "face", "character", "deal", "principl", "relev", "bodi", "other",
  "limit", "concern", "come", "disciplin", "featur", "toward", "explor",
  "reveal", "influenc", "pattern", "involv", "lead", "find", "strong", "great",
  "mark", "characterist", "emphas", "even", "generat", "howev", "identifi",
  "last", "made", "moment", "search", "sens", "special", "view", "analys",
  "analyt", "area", "argument", "aspect", "base", "book", "brazil",
  "brazilian", "call", "centuri", "challeng", "collect", "complex", "conflict",
  "constitut", "construct", "contemporari", "context", "contribut", "countri",
  "current", "debat", "decad", "differ", "dimens", "direct", "distinct",
  "element", "emerg", "end", "establish", "experi", "express", "focus",
  "format", "found", "framework", "function", "general", "group", "hand",
  "histor", "human", "idea", "impact", "import", "individu", "inform",
  "institut", "interest", "intern", "interpret", "investig", "latin", "life",
  "live", "make", "mean", "model", "modern", "network", "new", "notion",
  "object", "organ", "origin", "particular", "peopl", "period", "perspect",
  "popul", "posit", "possibl", "potenti", "power", "practic", "present",
  "problem", "process", "project", "ration", "realiti", "refer", "reform",
  "regard", "relationship", "repres", "represent", "research", "resourc",
  "respons", "role", "rule", "second", "sever", "sinc", "situat", "social",
  "societi", "sociolog", "specif", "sphere", "start", "strategi",
  "subject", "survey", "system", "technolog", "text", "theme", "tradit",
  "trajectori", "transform", "understand", "univers", "well", "will", "world",
  "year", "articl", "author", "relat", "studi", "analyz", "discuss", "result",
  "two", "within", "first", "also", "paper", "examin", "approach", "order",
  "show", "main", "can", "propos", "question", "point", "central", "form",
  "seek", "suggest", "three", "way", "aim", "allow", "among", "argu", "exist",
  "explain", "extent", "given", "highlight", "include", "issu", "like",
  "observ", "occur", "one", "provid", "set", "take", "thus", "upon", "use",
  "analysi", "case", "level", "consid", "term", "includ", "various", "includ"
)

## Cleaning steps, applied to the corpus in exactly this order.
cleaning_steps <- list(
  # collapse runs of whitespace
  function(corpus) tm_map(corpus, stripWhitespace),
  # lower-case the text
  function(corpus) tm_map(corpus, content_transformer(tolower)),
  # drop English stop words
  function(corpus) tm_map(corpus, removeWords, stopwords("english")),
  # drop punctuation
  function(corpus) tm_map(corpus, removePunctuation),
  # drop digits
  function(corpus) tm_map(corpus, removeNumbers),
  # reduce words to their stems
  function(corpus) tm_map(corpus, stemDocument),
  # drop the recurrent terms listed above
  function(corpus) tm_map(corpus, removeWords, custom_stems)
)

## Build the corpus from the English abstracts and run it through every step.
mat <- Corpus(VectorSource(dado$resumo_en))
for (apply_step in cleaning_steps) {
  mat <- apply_step(mat)
}

# Build the document-term matrix (one row per abstract, one column per term)

dtm <- DocumentTermMatrix(mat)

# Inspection only. removeSparseTerms() returns a new matrix and never modifies
# its argument; the script has always discarded that result, so every later
# step works on the full, unpruned vocabulary. The call is kept as an explicit
# print so that it is not mistaken for a pruning step. Assign the result to
# `dtm` only if pruning is really wanted, bearing in mind that it changes the
# fitted model and can leave documents without any term.
print(removeSparseTerms(dtm, 0.9))

# 3) Inspecting the matrix

## Total number of occurrences of each term across all documents
termFreq <- colSums(as.matrix(dtm))

## Show the most frequent terms (up to 100). head() stops at the end of the
## vector, so a small vocabulary is not padded with NA entries the way a fixed
## 1:100 index would be.
head(termFreq[order(termFreq, decreasing = TRUE)], 100)


# 4) Estimating the number (k) of topics

## FindTopicsNumber() takes the DocumentTermMatrix directly. The earlier
## detour through tidy() and quanteda::dfm() passed only the `term` column, so
## every (document, term) cell became a separate one-word "document" and the
## estimate was no longer computed on the abstracts themselves.
## Documents left without any term after cleaning are dropped here because
## LDA refuses a matrix that contains an all-zero row.
non_empty_docs <- unname(which(rowSums(as.matrix(dtm)) > 0))
dtm_k <- dtm[non_empty_docs, ]

## Fit one model per candidate k and score it (memory-hungry; can take hours)

otimiza_soc <- FindTopicsNumber(
  dtm_k,
  topics = seq(from = 5, to = 50, by = 1),
  metrics = c("CaoJuan2009"),
  method = "Gibbs",
  control = list(seed = 2017),
  mc.cores = 4L,
  verbose = TRUE)

## The authors' plot of this estimation is distributed as
## output_RplotKdeTopicos.pdf; the curve produced here can differ from it
## because the input is now the real document-term matrix.
FindTopicsNumber_plot(otimiza_soc)


# 5) Fit the topic model with topicmodels::LDA

## Gibbs sampler parameters

burnin <- 4000
iter <- 2000
thin <- 500
seed <- list(2003, 5, 63, 100001, 765)
nstart <- 5
best <- TRUE

## One seed is needed for each of the `nstart` independent runs.
stopifnot(length(seed) == nstart)

## Number (k) of topics

k <- 20

## Drop documents that have no term left after cleaning BEFORE fitting: LDA
## stops with an error when the matrix contains an all-zero row. The script
## used to remove row 685 by hand, and only after the LDA call; computing the
## empty rows removes the same document(s) without depending on a fixed index.
doc_lengths <- rowSums(as.matrix(dtm))
dtm <- dtm[unname(which(doc_lengths > 0)), ]

## Fit the model with the Gibbs parameters defined above

ldaOut <- LDA(
  dtm,
  k,
  method = "Gibbs",
  control = list(
    nstart = nstart,
    seed = seed,
    best = best,
    burnin = burnin,
    iter = iter,
    thin = thin
  )
)

# Export every document together with its cleaned text (the stemmed tokens)

lemas <- content(mat)

write.csv2(lemas, file = "lemas.csv", fileEncoding = "UTF-8")

# 6) Write the outputs

## Spreadsheet with the most likely topic of each document
doc_topics <- topics(ldaOut)
lda.topics <- as.matrix(doc_topics)
write.csv2(doc_topics, "output_DocsToTopics.csv", fileEncoding = "UTF-8")

## Spreadsheet with the 20 most common terms of each of the k topics
lda.terms <- as.matrix(terms(ldaOut, 20))
write.csv2(lda.terms, "output_20termsToTopicos.csv", fileEncoding = "UTF-8")

## Spreadsheet with the probability of each document belonging to each topic.
## The gamma matrix carries no row names, so its rows would be numbered 1..n
## and stop matching the document ids used in output_DocsToTopics.csv as soon
## as any document has been dropped. Label the rows with the same ids.
topicProbs <- as.data.frame(ldaOut@gamma)
doc_ids <- names(doc_topics)
if (!is.null(doc_ids) && length(doc_ids) == nrow(topicProbs)) {
  rownames(topicProbs) <- doc_ids
}
write.csv2(topicProbs, "output_topicProbs.csv", fileEncoding = "UTF-8")


